## Load packages. `bikeshare` is used below without being created here; it must already be loaded.
library(ggplot2)
library(ggthemes)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Scatter plot of rental count against temperature, coloured by temperature.
# `alpha` is a fixed setting, so it belongs outside aes(): inside aes() ggplot2
# treats the constant as mapped data, rescales it and adds an "alpha" legend.
plot <- ggplot(data = bikeshare, aes(x = temp, y = count))
plot + geom_point(aes(color = temp), alpha = 0.6)
# Convert datetime into POSIXct before plotting.
# `format` has to be an argument of as.POSIXct(), not of paste(): inside
# paste() it is just extra text appended to every timestamp. The pattern must
# also match the "YYYY-MM-DD HH:MM:SS" layout of the column. A column that is
# already POSIXct is returned unchanged by as.POSIXct().
bikeshare$DateTime <- as.POSIXct(
  bikeshare$datetime,
  format = "%Y-%m-%d %H:%M:%S"
)
class(bikeshare$DateTime)
## [1] "POSIXct" "POSIXt"
# Rental count over time, coloured by temperature on a light-green-to-pink
# gradient. `alpha` is a fixed setting and therefore sits outside aes(), which
# avoids a spurious "alpha" legend.
plot2 <- ggplot(data = bikeshare, aes(x = DateTime, y = count))
plot2 +
  geom_point(aes(color = temp), alpha = 0.6) +
  scale_colour_gradient(high = "pink", low = "light green")
## Pearson correlation between temperature and rental count,
## computed on the rows where both values are present.
with(
  bikeshare,
  cor(temp, count, use = "complete.obs", method = "pearson")
)
## [1] 0.3944536
# Box plot of rental count per season, one fill colour per season.
plot3 <- ggplot(
  data = bikeshare,
  mapping = aes(x = factor(season), y = count)
)
plot3 + geom_boxplot(mapping = aes(fill = factor(season)))
# Hour of day as a two-digit character column ("00" to "23").
# format() is vectorised over date-time vectors, so no per-element sapply()
# call is needed; the result is the same character vector.
bikeshare$hour <- format(bikeshare$DateTime, "%H")
head(bikeshare)
## # A tibble: 6 x 14
## datetime season holiday workingday weather temp atemp humidity
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2011-01-01 00:00:00 1 0 0 1 9.84 14.4 81
## 2 2011-01-01 01:00:00 1 0 0 1 9.02 13.6 80
## 3 2011-01-01 02:00:00 1 0 0 1 9.02 13.6 80
## 4 2011-01-01 03:00:00 1 0 0 1 9.84 14.4 75
## 5 2011-01-01 04:00:00 1 0 0 1 9.84 14.4 75
## 6 2011-01-01 05:00:00 1 0 0 2 9.84 12.9 75
## # … with 6 more variables: windspeed <dbl>, casual <dbl>, registered <dbl>,
## # count <dbl>, DateTime <dttm>, hour <chr>
# Subset to working days (workingday == 1) and plot count by hour of day,
# coloured by temperature. Points are jittered horizontally only.
# `alpha` is a fixed setting, so it goes outside aes() (no spurious legend);
# the jitter arguments are spelled out instead of relying on partial matching.
bike_w1 <- bikeshare[which(bikeshare$workingday == 1), ]
plot4 <- ggplot(data = bike_w1, aes(x = hour, y = count))
plot4 +
  geom_point(
    aes(color = temp),
    alpha = 0.6,
    position = position_jitter(width = 1, height = 0)
  ) +
  scale_colour_gradient(high = "orange", low = "dark blue")
# Subset to non-working days (workingday == 0) and plot count by hour of day,
# coloured by temperature. Points are jittered horizontally only.
# `alpha` is a fixed setting, so it goes outside aes() (no spurious legend);
# the jitter arguments are spelled out instead of relying on partial matching.
bike_wk <- bikeshare[which(bikeshare$workingday == 0), ]
plot5 <- ggplot(data = bike_wk, aes(x = hour, y = count))
plot5 +
  geom_point(
    aes(color = temp),
    alpha = 0.6,
    position = position_jitter(width = 1, height = 0)
  ) +
  scale_colour_gradient(high = "dark blue", low = "green")
library(caTools)
# Seeded 70/30 split: sample.split() returns a logical vector, TRUE marking
# the rows that go into the training set; the remaining rows form the test set.
set.seed(101)
sample <- sample.split(bikeshare$count, SplitRatio = 0.7)
train <- bikeshare[sample, ]
test <- bikeshare[!sample, ]
# Fit a linear model on the training set predicting count from season (as a
# factor), workingday, holiday, hour, temp and casual -- not from temp alone.
# `hour` is a character column, so lm() expands it into one dummy per hour.
# NOTE(review): the head() output above lists `casual` and `registered` next to
# `count`; if count = casual + registered, `casual` leaks part of the target
# into the predictors -- confirm whether it should stay in the formula.
train.model<-lm(formula = count ~ factor(season) + workingday + holiday + hour + temp + casual, data = train)
summary(train.model)
##
## Call:
## lm(formula = count ~ factor(season) + workingday + holiday +
## hour + temp + casual, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -382.92 -45.23 -9.58 43.14 449.95
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -73.26322 5.83746 -12.551 < 2e-16 ***
## factor(season)2 12.15086 3.64732 3.331 0.000868 ***
## factor(season)3 10.61185 4.63163 2.291 0.021981 *
## factor(season)4 50.20128 2.92276 17.176 < 2e-16 ***
## workingday 80.85236 2.42565 33.332 < 2e-16 ***
## holiday 24.70403 6.13691 4.025 5.74e-05 ***
## hour01 -14.22644 6.76528 -2.103 0.035511 *
## hour02 -20.08635 6.86078 -2.928 0.003425 **
## hour03 -28.38097 6.88698 -4.121 3.81e-05 ***
## hour04 -29.15497 6.82029 -4.275 1.94e-05 ***
## hour05 -15.63559 6.77813 -2.307 0.021094 *
## hour06 33.87405 6.76304 5.009 5.60e-07 ***
## hour07 157.48062 6.74992 23.331 < 2e-16 ***
## hour08 295.91529 6.82217 43.376 < 2e-16 ***
## hour09 120.01776 6.83367 17.563 < 2e-16 ***
## hour10 35.84389 6.94032 5.165 2.47e-07 ***
## hour11 40.97710 6.92076 5.921 3.34e-09 ***
## hour12 64.30936 7.01301 9.170 < 2e-16 ***
## hour13 52.53715 6.99093 7.515 6.34e-14 ***
## hour14 32.55945 7.13710 4.562 5.15e-06 ***
## hour15 43.00719 7.03676 6.112 1.03e-09 ***
## hour16 109.16193 7.03633 15.514 < 2e-16 ***
## hour17 268.08535 7.07382 37.898 < 2e-16 ***
## hour18 259.23718 6.97918 37.144 < 2e-16 ***
## hour19 164.64639 6.78447 24.268 < 2e-16 ***
## hour20 105.46635 6.70103 15.739 < 2e-16 ***
## hour21 72.99693 6.78025 10.766 < 2e-16 ***
## hour22 49.27957 6.75445 7.296 3.27e-13 ***
## hour23 21.97105 6.82064 3.221 0.001282 **
## temp 1.80277 0.23559 7.652 2.22e-14 ***
## casual 2.20688 0.02894 76.261 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 85.83 on 7606 degrees of freedom
## Multiple R-squared: 0.781, Adjusted R-squared: 0.7802
## F-statistic: 904.2 on 30 and 7606 DF, p-value: < 2.2e-16
# Training residuals as a one-column data frame (column `res`).
res <- data.frame(res = residuals(train.model))
## Histogram of the residuals (default binning)
ggplot(data = res, mapping = aes(x = res)) +
  geom_histogram(alpha = 0.5, fill = "blue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Base R diagnostic plots for the fitted linear model.
plot(train.model)
## Predictions for the held-out test rows
test.predictions <- predict(train.model, newdata = test)
## Predicted and observed counts side by side (columns `pred` and `real`)
results <- as.data.frame(
  cbind(pred = test.predictions, real = test$count)
)
# Calculate MSE: mean of the squared prediction errors
mse <- mean((results$pred - results$real)^2)
print(mse)
## [1] 7181.107
# Root mean squared error, in the same units as count
mse^0.5
## [1] 84.74141
# R-squared on the test set: 1 - SSE / SST.
# SSE is the sum of squared prediction errors; SST is the sum of squared
# deviations of the observed test counts from their own mean. The baseline
# mean has to come from the same observations as SSE: using the mean of the
# whole data set instead can only make SST larger and so overstates R2.
# Any previously recorded R2 value computed with the whole-data mean will
# differ slightly from what this prints.
SSE <- sum((results$pred - results$real)^2)
SST <- sum((results$real - mean(results$real))^2)
R2 <- 1 - SSE / SST
print(R2)
## [1] 0.7696607